caffe fc层源码注释

基本原理

note：注意理解反向传播过程中，修改的对象是bottom（的diff），这一点与前向相反，前向修改的是top（的data）。
backpropagation
矩阵乘法函数

成员变量

// Member variables of the inner-product (fully connected) layer.
// NOTE(review): LayerSetUp also assigns a `transpose_` flag that is not listed
// in this excerpt -- confirm it is declared alongside these members.
protected:
  int M_;  // number of samples: Reshape sets it to bottom[0]->count(0, axis)
  int K_;  // length of one flattened input feature vector: bottom[0]->count(axis)
  int N_;  // number of output neurons (num_output from the layer parameters)
  bool bias_term_;  // whether a bias is added (the "+1" node in the figure above)
  Blob<Dtype> bias_multiplier_;  // bias multiplier: Reshape fills it with M_ ones

成员函数

构造函数等成员函数基本上直接继承自父类，子类中无需实现；但LayerSetUp必须由子类自己实现。需要实现的成员函数主要有四个：LayerSetUp、Reshape、Forward_cpu、Backward_cpu。

LayerSetUp

完成FC层变量初始化，从网络配置文件train_val.prototxt提取对应参数初始值完成fc层初始化。如输出维度、权重、偏置等

/**
 * @brief One-time setup of the inner-product (fully connected) layer.
 *
 * Reads num_output, bias_term and transpose from the layer's
 * inner_product_param, derives the output size N_ and the flattened input
 * size K_, and -- unless parameter blobs already exist -- allocates and fills
 * the weight blob and, when enabled, the bias blob.
 *
 * @param bottom Input blobs; only bottom[0] is inspected, to compute K_.
 * @param top    Output blobs; not used by this function.
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // Configuration values taken from the layer parameters.
  N_ = this->layer_param_.inner_product_param().num_output();
  bias_term_ = this->layer_param_.inner_product_param().bias_term();
  transpose_ = this->layer_param_.inner_product_param().transpose();

  // CanonicalAxisIndex normalizes the configured axis index (per the original
  // note, it turns otherwise-invalid axis inputs into a valid index).
  const int axis = bottom[0]->CanonicalAxisIndex(
      this->layer_param_.inner_product_param().axis());
  // count(axis) is the number of elements from `axis` to the last axis, i.e.
  // the flattened feature length. For an (N, C, H, W) input with axis == 1
  // this gives K_ = C * H * W.
  K_ = bottom[0]->count(axis);

  if (this->blobs_.size() == 0) {
    // No parameters yet: one blob for the weights, plus one for the bias
    // when a bias term is requested.
    this->blobs_.resize(bias_term_ ? 2 : 1);

    // Weights are stored as N_ x K_, or as K_ x N_ when transpose_ is set.
    vector<int> weight_shape(2);
    weight_shape[0] = transpose_ ? K_ : N_;
    weight_shape[1] = transpose_ ? N_ : K_;
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.inner_product_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());

    // The bias is a single vector with one entry per output neuron.
    if (bias_term_) {
      vector<int> bias_shape(1, N_);
      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
          this->layer_param_.inner_product_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  } else {
    // Parameter blobs are already present, so leave them untouched.
    LOG(INFO) << "Skipping parameter initialization";
  }

  // Gradient flags: every parameter blob of this layer is marked as needing
  // a gradient during back-propagation.
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}

Reshape

调整输出层的size

/**
 * @brief Sizes the output blob and the bias multiplier for the current input.
 *
 * Checks that the flattened input length still equals K_, recomputes the
 * sample count M_, reshapes top[0] to the leading bottom axes followed by a
 * single axis of size N_, and (with a bias term) rebuilds bias_multiplier_
 * as a vector of M_ ones.
 *
 * @param bottom Input blobs; bottom[0] supplies the input shape.
 * @param top    Output blobs; top[0] is reshaped.
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const int axis = bottom[0]->CanonicalAxisIndex(
      this->layer_param_.inner_product_param().axis());

  // The weights were sized for K_ inputs in LayerSetUp, so the flattened
  // feature length must not change.
  const int new_K = bottom[0]->count(axis);
  CHECK_EQ(K_, new_K)
      << "Input size incompatible with inner product parameters.";

  // Everything in front of `axis` indexes independent inner products; their
  // total number is M_.
  M_ = bottom[0]->count(0, axis);

  // Output shape: keep the leading `axis` dimensions of the input and replace
  // all flattened axes with one axis of length N_.
  vector<int> out_shape(bottom[0]->shape());
  out_shape.resize(axis + 1);
  out_shape[axis] = N_;
  top[0]->Reshape(out_shape);

  if (!bias_term_) {
    return;
  }
  // The bias multiplier is a length-M_ vector of ones. caffe_set(n, alpha, y)
  // writes alpha into the n elements starting at y.
  vector<int> multiplier_shape(1, M_);
  bias_multiplier_.Reshape(multiplier_shape);
  caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
}

Forward_cpu

/**
 * @brief CPU forward pass: top = bottom * op(W) (+ ones * bias).
 *
 * Shapes, as established by LayerSetUp and Reshape:
 *   - bottom data: M_ x K_
 *   - weights:     N_ x K_ (used transposed), or K_ x N_ when transpose_ is set
 *   - bias:        N_ entries
 *   - top data:    M_ x N_
 * The weights and bias are shared by all M_ samples of the batch.
 *
 * caffe_cpu_gemm(TransA, TransB, M, N, K, alpha, A, B, beta, C) computes
 * C <- alpha * op(A) * op(B) + beta * C, where op(A) is M x K, op(B) is K x N
 * and the first two arguments select whether A and B are transposed.
 *
 * @param bottom Input blobs; bottom[0] holds the input data.
 * @param top    Output blobs; top[0] receives the result.
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype one = Dtype(1);
  const Dtype zero = Dtype(0);

  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = top[0]->mutable_cpu_data();
  const Dtype* weight = this->blobs_[0]->cpu_data();

  // Step 1: output = input * op(weight). beta is zero, so the previous
  // contents of the output are overwritten. The weight is transposed inside
  // the product unless it is already stored as K_ x N_.
  caffe_cpu_gemm<Dtype>(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans,
      M_, N_, K_, one, input, weight, zero, output);

  if (!bias_term_) {
    return;
  }

  // Step 2: output += bias_multiplier * bias. The (M_ x 1) column of ones
  // times the (1 x N_) bias row adds the same bias to every sample.
  const Dtype* bias = this->blobs_[1]->cpu_data();
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, one,
      bias_multiplier_.cpu_data(), bias, one, output);
}

Backward_cpu

反向传播主要是为了更新W和b，其中的关键就是计算偏导，因此这个函数主要做了三件事：计算diff(W)、diff(b)和$\delta$（残差，即对bottom的梯度）。

/**
 * @brief CPU backward pass: gradients w.r.t. weights, bias and bottom data.
 *
 * With top diff of shape M_ x N_ and bottom data of shape M_ x K_:
 *   - weight diff += top_diff' * bottom_data   (N_ x K_), or
 *                    bottom_data' * top_diff   (K_ x N_) when transpose_ is set
 *   - bias diff   += top_diff' * bias_multiplier        (N_ entries)
 *   - bottom diff  = top_diff * op(weight)              (M_ x K_)
 * Parameter gradients are accumulated (beta = 1); the bottom gradient is
 * overwritten (beta = 0).
 *
 * @param top            Output blobs; top[0] supplies the incoming diff.
 * @param propagate_down propagate_down[0] enables the bottom gradient.
 * @param bottom         Input blobs; bottom[0] supplies the data and receives
 *                       the diff.
 */
template <typename Dtype>
void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype one = Dtype(1);
  const Dtype zero = Dtype(0);

  // Gradient with respect to the weights.
  if (this->param_propagate_down_[0]) {
    const Dtype* out_grad = top[0]->cpu_diff();
    const Dtype* input = bottom[0]->cpu_data();
    Dtype* weight_grad = this->blobs_[0]->mutable_cpu_diff();
    if (transpose_) {
      // Weights stored K_ x N_: weight_grad += input' * out_grad.
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
          one, input, out_grad, one, weight_grad);
    } else {
      // Weights stored N_ x K_: weight_grad += out_grad' * input.
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_,
          one, out_grad, input, one, weight_grad);
    }
  }

  // Gradient with respect to the bias: bias_grad += out_grad' * ones.
  if (bias_term_ && this->param_propagate_down_[1]) {
    const Dtype* out_grad = top[0]->cpu_diff();
    Dtype* bias_grad = this->blobs_[1]->mutable_cpu_diff();
    caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, one, out_grad,
        bias_multiplier_.cpu_data(), one, bias_grad);
  }

  // Gradient with respect to the bottom data: in_grad = out_grad * op(weight).
  if (propagate_down[0]) {
    const Dtype* out_grad = top[0]->cpu_diff();
    const Dtype* weight = this->blobs_[0]->cpu_data();
    Dtype* in_grad = bottom[0]->mutable_cpu_diff();
    // A K_ x N_ weight must be transposed here; an N_ x K_ weight is used
    // as stored.
    caffe_cpu_gemm<Dtype>(CblasNoTrans,
        transpose_ ? CblasTrans : CblasNoTrans,
        M_, K_, N_, one, out_grad, weight, zero, in_grad);
  }
}
// In CPU_ONLY builds, stub out this layer's GPU entry points. Per the original
// note, STUB_GPU disables Forward_gpu and Backward_gpu (macro definition not
// shown here -- confirm).
#ifdef CPU_ONLY  
STUB_GPU(InnerProductLayer);  
#endif  
// Register the fully connected layer. Presumably INSTANTIATE_CLASS emits the
// template instantiations and REGISTER_LAYER_CLASS makes the "InnerProduct"
// type creatable by name -- verify against the macro definitions.
INSTANTIATE_CLASS(InnerProductLayer);
REGISTER_LAYER_CLASS(InnerProduct);
}  // namespace caffe